04_describe

WHO 2025 data on Tuberculosis, data description

In this script, it’s intended to give an explanation on data features, using different methods and graphs. The provided data description is based on three datasets:

Overall, the document aims to do some light-weight analysis and data exploration, prior to the heavy-weight analysis in the subsequent 2x analysis files.


Loading relevant libraries:

library(tidyverse) 
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

#Access to the function for loading the datasets and to save them
source("99_proj_func.R")

Loading data:

Joined dataset, year 2024:

#Loading age and sex and risk group data (2024 only) - Joined and augmented version of the data: 

data_file          <- "03_aug_TB_age_sex.tsv"
TB_age_sex_joined  <- load_data(data_file)
Loading ../data/03_aug_TB_age_sex.tsv from local file…
#Display the data: 
slice_sample(TB_age_sex_joined, n=5)
# A tibble: 5 × 18
  country            year age_group sex   risk_factor TB_cases_best TB_cases_min
  <chr>             <dbl> <chr>     <chr> <chr>               <dbl>        <dbl>
1 Iraq               2024 25-34     Male  no risk fa…          1300          290
2 Ireland            2024 35-44     Both  no risk fa…            69           36
3 Republic of Mold…  2024 20-24     Fema… no risk fa…            19            8
4 Kazakhstan         2024 20-24     Male  no risk fa…           240           84
5 Haiti              2024 15+       Fema… no risk fa…          8400         5800
# ℹ 11 more variables: TB_cases_max <dbl>, population_size <dbl>,
#   total_TB_cases_best <dbl>, total_TB_cases_min <dbl>,
#   total_TB_cases_max <dbl>, TB_cases_pr_100k_best <dbl>,
#   TB_cases_pr_100k_min <dbl>, TB_cases_pr_100k_max <dbl>,
#   total_TB_cases_pr_100k_best <dbl>, total_TB_cases_pr_100k_min <dbl>,
#   total_TB_cases_pr_100k_max <dbl>

Joined dataset, year 2015 to 2024:

#Loading age and sex and risk group data (2024 only) - Joined version of the data: 

data_file           <- "03_aug_TB_10_years.tsv"
TB_10_years_joined  <- load_data(data_file)
Loading ../data/03_aug_TB_10_years.tsv from local file…
#Display the data: 
slice_sample(TB_10_years_joined, n=5)
# A tibble: 5 × 75
  country   g_whoregion.x  year e_pop_num e_inc_100k e_inc_100k_lo e_inc_100k_hi
  <chr>     <chr>         <dbl>     <dbl>      <dbl>         <dbl>         <dbl>
1 Bolivia … AMR            2023  12244156      105            86           127  
2 Mongolia  WPR            2018   3167705      428           220           703  
3 Samoa     WPR            2016    203500        6.4           5.2           7.7
4 Cuba      AMR            2017  11247828        7.6           6.2           9.1
5 Hungary   EUR            2018   9774396        8             6.5           9.6
# ℹ 68 more variables: e_inc_num <dbl>, e_inc_num_lo <dbl>, e_inc_num_hi <dbl>,
#   e_tbhiv_prct <dbl>, e_inc_tbhiv_100k <dbl>, e_inc_tbhiv_100k_lo <dbl>,
#   e_inc_tbhiv_100k_hi <dbl>, e_inc_tbhiv_num <dbl>, e_inc_tbhiv_num_lo <dbl>,
#   e_inc_tbhiv_num_hi <dbl>, e_mort_exc_tbhiv_100k <dbl>,
#   e_mort_exc_tbhiv_100k_lo <dbl>, e_mort_exc_tbhiv_100k_hi <dbl>,
#   e_mort_exc_tbhiv_num <dbl>, e_mort_exc_tbhiv_num_lo <dbl>,
#   e_mort_exc_tbhiv_num_hi <dbl>, e_mort_tbhiv_100k <dbl>, …

WHO TB burden estimates [>1Mb]:

#Loading TB burden estimates: 

data_file   <- "02_clean_TB_burden.tsv"
TB_burden   <- load_data(data_file)
Loading ../data/02_clean_TB_burden.tsv from local file…
#Display the data: 
slice_sample(TB_burden, n=5)
# A tibble: 5 × 45
  country   g_whoregion  year e_pop_num e_inc_100k e_inc_100k_lo e_inc_100k_hi
  <chr>     <chr>       <dbl>     <dbl>      <dbl>         <dbl>         <dbl>
1 Lithuania EUR          2000   3500408      181           147           218  
2 Kenya     AFR          2014  46051444      423           183           761  
3 Andorra   EUR          2002     66507        7.5           6.1           9.1
4 Albania   EUR          2023   2811661       15            12            19  
5 Bahrain   EMR          2003    811274       56            46            68  
# ℹ 38 more variables: e_inc_num <dbl>, e_inc_num_lo <dbl>, e_inc_num_hi <dbl>,
#   e_tbhiv_prct <dbl>, e_inc_tbhiv_100k <dbl>, e_inc_tbhiv_100k_lo <dbl>,
#   e_inc_tbhiv_100k_hi <dbl>, e_inc_tbhiv_num <dbl>, e_inc_tbhiv_num_lo <dbl>,
#   e_inc_tbhiv_num_hi <dbl>, e_mort_exc_tbhiv_100k <dbl>,
#   e_mort_exc_tbhiv_100k_lo <dbl>, e_mort_exc_tbhiv_100k_hi <dbl>,
#   e_mort_exc_tbhiv_num <dbl>, e_mort_exc_tbhiv_num_lo <dbl>,
#   e_mort_exc_tbhiv_num_hi <dbl>, e_mort_tbhiv_100k <dbl>, …

Dictionary:

#Loading the data dictionary: 

data_file   <- "01_load_dictionary.tsv"

TB_dictionary <- load_data(data_file)
Loading ../data/01_load_dictionary.tsv from local file…
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
slice_sample(TB_dictionary, n=5)
# A tibble: 5 × 4
  variable_name       dataset               code_list definition                
  <chr>               <chr>                 <chr>     <chr>                     
1 new_sp              Notification          <NA>      New pulmonary smear-posit…
2 miners_screen       Policies and services <NA>      (if miners_screen_data_av…
3 dst_ptd             Laboratories          <NA>      Number of sites providing…
4 newrel_m5564        Notification          <NA>      New and relapse cases (bu…
5 e_inc_tbhiv_100k_hi Estimates             <NA>      Estimated incidence of TB…

Data description:

TB_age_sex_joined (2024) - Description:

This dataset contains the number of TB cases across different countries, categorized by age group and gender. It also includes cases per 100,000 population, enabling standardized and comparable analysis between countries.

slice_sample(TB_age_sex_joined, n=5)
# A tibble: 5 × 18
  country            year age_group sex   risk_factor TB_cases_best TB_cases_min
  <chr>             <dbl> <chr>     <chr> <chr>               <dbl>        <dbl>
1 Madagascar         2024 5-14      Fema… no risk fa…          2200            0
2 United Republic …  2024 25-34     Fema… no risk fa…          7300            0
3 Zimbabwe           2024 0-14      Both  no risk fa…          6100         2500
4 Colombia           2024 10-14     Male  no risk fa…           160           55
5 French Polynesia   2024 20-24     Both  no risk fa…             6            2
# ℹ 11 more variables: TB_cases_max <dbl>, population_size <dbl>,
#   total_TB_cases_best <dbl>, total_TB_cases_min <dbl>,
#   total_TB_cases_max <dbl>, TB_cases_pr_100k_best <dbl>,
#   TB_cases_pr_100k_min <dbl>, TB_cases_pr_100k_max <dbl>,
#   total_TB_cases_pr_100k_best <dbl>, total_TB_cases_pr_100k_min <dbl>,
#   total_TB_cases_pr_100k_max <dbl>

We can briefly explore the big differences between comparing countries based on their total TB cases vs comparing with TB cases pr. 100k citizens.

Scatter plot of the countries with the top 10 most TB cases in total:

#Getting the 10 countries with highest total amount of TB cases:  
top_10_countries <- 
  TB_age_sex_joined |>
  group_by(country) |>
  summarise(total_TB = first(total_TB_cases_best)) |>
  arrange(desc(total_TB)) |>
  slice_head(n = 10) |> 
  pull(country)

#Making a tibble for those countries, and plotting them: 
plot <- TB_age_sex_joined |>
  filter(country %in% top_10_countries) |>
  group_by(country) |>
  summarise(
    mean_best = sum(TB_cases_best, na.rm = TRUE), #Sum of TB cases for this country 
    min_val  = sum(TB_cases_min, na.rm = TRUE),
    max_val  = sum(TB_cases_max, na.rm = TRUE),
  ) |> 
  
  ggplot(aes(x = mean_best, y = fct_reorder(country, mean_best))) +
  #fct_reorder(country, mean_best) ensures that we order sort all countries,
  #based on mean_best value (descending order). 
  
  geom_point(size = 3, color = "orange") +
  geom_errorbarh(aes(xmin = min_val, xmax = max_val), height = 0.2) +
  labs(
    x = "TB cases\n(Best estimate with min/max)",
    y = "Country",
    title = "Top 10 Countries - Total TB cases, with error bars"
  ) +
  theme_minimal()

#Save it

ggsave(
  filename = "../results/04_1_top10_TB.png",   #Choose the folder + filename
  plot = plot,
  width = 8,       # inches
  height = 5,      # inches
  dpi = 300        # high quality
)

plot

Note that really large countries like China and India is part of this graph.

That is not to state that TB is not a problem in these countries, but it is to highlight that the TB intensity might not be as bad as you might think.

This will make sense once you glance at the following plot.

Scatter plot of the countries with the top 10 most TB cases pr. 100k citizens (standardized):

#Making an object for storing the top 10 countries with most TB cases: 
top_10_countries_100k <- TB_age_sex_joined |>
  group_by(country) |>
  summarise(
    total_TB_cases_pr_100k_best = first(total_TB_cases_pr_100k_best),
    #Note: The value is constant for each country, so we just use first()
    #in order to pick the first value (we want to reduce several rows  
    #to 1 row pr. country) 
    ) |>
  arrange(desc(total_TB_cases_pr_100k_best)) |>
  slice_head(n = 10) |>
  pull(country)

#Making a tibble for those countries, and plotting them: 
plot <- TB_age_sex_joined |>
  filter(country %in% top_10_countries_100k) |>
  group_by(country) |>
  summarise(
    mean_best = first(total_TB_cases_pr_100k_best),
    min_val  = first(total_TB_cases_pr_100k_min),
    max_val  = first(total_TB_cases_pr_100k_max),
  ) |> 
  
    ggplot(aes(x = mean_best, y = fct_reorder(country, mean_best))) +
  #fct_reorder(country, mean_best) ensures that we order sort all countries,
  #based on mean_best value (descending order). 
  
  geom_point(size = 3, color = "orange") +
  geom_errorbarh(aes(xmin = min_val, xmax = max_val), height = 0.2) +
  labs(
    x = "TB cases pr. 100k\n(Best estimate with min/max)",
    y = "Country",
    title = "Top 10 Countries - TB cases pr. 100k citizens, with error bars"
  ) +
  theme_minimal()

#Save it

ggsave(
  filename = "../results/04_2_top10_TB_100k.png",   #Choose the folder + filename
  plot = plot,
  width = 8,       # inches
  height = 5,      # inches
  dpi = 300        # high quality
)
plot

Would you look at that!

Except for the Philippines, none of these countries were part of the plot for the “top 10 total TB cases countries”.

It goes to show that the standardized TB cases pr. 100k of citizens might be a better measure for TB disease intensity of a country.

*TB_10_years_joined (2015-2024) - Description:

We combined three WHO datasets — TB burden estimates, MDR/RR-TB burden estimates, and TB infection in household contacts — into a single multi-country, multi-year panel covering approximately 10 years.

The merged dataset contains measures of TB incidence and mortality, MDR/RR-TB incidence, and estimated household infection rates for each country–year.

This integrated dataset allows us to describe global TB trends, compare drug-resistant and drug-sensitive TB, and evaluate household transmission indicators.

(Note: Multidrug-resistant tuberculosis (MDR-TB) is defined as disease due to Mycobacterium tuberculosis that is resistant to isoniazid (H) and rifampicin (R) with or without resistance to other drugs. RR - Rifampicin resistant).

slice_sample(TB_10_years_joined, n=5)
# A tibble: 5 × 75
  country g_whoregion.x  year e_pop_num e_inc_100k e_inc_100k_lo e_inc_100k_hi
  <chr>   <chr>         <dbl>     <dbl>      <dbl>         <dbl>         <dbl>
1 Libya   EMR            2019   6951035       40            24            76  
2 Haiti   AMR            2016  10671458      297           242           358  
3 Italy   EUR            2022  59619113        4.5           3.7           5.5
4 Cyprus  EUR            2018   1270742        5             4.1           6  
5 Peru    AMR            2015  30457601      131           107           158  
# ℹ 68 more variables: e_inc_num <dbl>, e_inc_num_lo <dbl>, e_inc_num_hi <dbl>,
#   e_tbhiv_prct <dbl>, e_inc_tbhiv_100k <dbl>, e_inc_tbhiv_100k_lo <dbl>,
#   e_inc_tbhiv_100k_hi <dbl>, e_inc_tbhiv_num <dbl>, e_inc_tbhiv_num_lo <dbl>,
#   e_inc_tbhiv_num_hi <dbl>, e_mort_exc_tbhiv_100k <dbl>,
#   e_mort_exc_tbhiv_100k_lo <dbl>, e_mort_exc_tbhiv_100k_hi <dbl>,
#   e_mort_exc_tbhiv_num <dbl>, e_mort_exc_tbhiv_num_lo <dbl>,
#   e_mort_exc_tbhiv_num_hi <dbl>, e_mort_tbhiv_100k <dbl>, …
df10 <- TB_10_years_joined #just shortening the name so we don't have to type so much

Definitions of all the variables that were not renamed:

df_colnames <- tibble(colname = colnames(df10))
df_colnames
# A tibble: 75 × 1
   colname      
   <chr>        
 1 country      
 2 g_whoregion.x
 3 year         
 4 e_pop_num    
 5 e_inc_100k   
 6 e_inc_100k_lo
 7 e_inc_100k_hi
 8 e_inc_num    
 9 e_inc_num_lo 
10 e_inc_num_hi 
# ℹ 65 more rows
dict_filtered <- TB_dictionary[c("variable_name", "definition")] |> 
   filter(variable_name %in% (df_colnames |> 
                                pull(colname)))
print(dict_filtered)
# A tibble: 43 × 2
   variable_name definition                                                     
   <chr>         <chr>                                                          
 1 country       Country or territory name                                      
 2 rr_new        Number of new bacteriologically confirmed pulmonary TB patient…
 3 c_newinc_100k Case notification rate, which is the total of new and relapse …
 4 cfr           Estimated TB case fatality ratio                               
 5 cfr_hi        Estimated TB case fatality ratio: high bound                   
 6 cfr_lo        Estimated TB case fatality ratio: low bound                    
 7 cfr_pct       Estimated TB case fatality ratio expressed as a percentage     
 8 cfr_pct_hi    Estimated TB case fatality ratio: high bound expressed as a pe…
 9 cfr_pct_lo    Estimated TB case fatality ratio: low bound expressed as a per…
10 e_inc_100k    Estimated incidence (all forms) per 100 000 population         
# ℹ 33 more rows

After considering the definitions, we can select only the important variables that hold the most descriptive information to get to know the data. We are selecting data per 100k to get the most comparable summary.

df10_selection <- df10 |> 
      select(
        country,
        year,
        case_fatality_ratio = cfr,
        incidence_100k = e_inc_100k,
        incidence_hiv_100k = e_inc_tbhiv_100k,
        mortality_100k = e_mort_100k,
        mortality_hiv_100k = e_mort_tbhiv_100k,
        mortality_nohiv_100k = e_mort_exc_tbhiv_100k,
        rr_treated,
        rr_incidence,
        rr_labconf_pulm,
        contacts_best = Contacts_best,
        preventive_tx = Preventive_Tx_Pct
)
df10_selection
# A tibble: 1,847 × 13
   country      year case_fatality_ratio incidence_100k incidence_hiv_100k
   <chr>       <dbl>               <dbl>          <dbl>              <dbl>
 1 Afghanistan  2015                0.22            200               0.05
 2 Afghanistan  2016                0.19            204               0.06
 3 Afghanistan  2017                0.18            209               0.06
 4 Afghanistan  2018                0.18            212               0.06
 5 Afghanistan  2019                0.17            213               0.06
 6 Afghanistan  2020                0.2             205               0.03
 7 Afghanistan  2021                0.2             206               0.01
 8 Afghanistan  2022                0.19            206               0.06
 9 Afghanistan  2023                0.19            204               0.05
10 Afghanistan  2024                0.19            203               0.08
# ℹ 1,837 more rows
# ℹ 8 more variables: mortality_100k <dbl>, mortality_hiv_100k <dbl>,
#   mortality_nohiv_100k <dbl>, rr_treated <dbl>, rr_incidence <dbl>,
#   rr_labconf_pulm <dbl>, contacts_best <dbl>, preventive_tx <dbl>

Summary statistics for main TB variables

df10_selection %>%
  reframe(across(where(is.numeric),
                   list(mean = mean, sd = sd),
                   na.rm = TRUE))
Warning: There was 1 warning in `reframe()`.
ℹ In argument: `across(where(is.numeric), list(mean = mean, sd = sd), na.rm =
  TRUE)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))
# A tibble: 1 × 24
  year_mean year_sd case_fatality_ratio_mean case_fatality_ratio_sd
      <dbl>   <dbl>                    <dbl>                  <dbl>
1     2019.    2.86                    0.139                  0.103
# ℹ 20 more variables: incidence_100k_mean <dbl>, incidence_100k_sd <dbl>,
#   incidence_hiv_100k_mean <dbl>, incidence_hiv_100k_sd <dbl>,
#   mortality_100k_mean <dbl>, mortality_100k_sd <dbl>,
#   mortality_hiv_100k_mean <dbl>, mortality_hiv_100k_sd <dbl>,
#   mortality_nohiv_100k_mean <dbl>, mortality_nohiv_100k_sd <dbl>,
#   rr_treated_mean <dbl>, rr_treated_sd <dbl>, rr_incidence_mean <dbl>,
#   rr_incidence_sd <dbl>, rr_labconf_pulm_mean <dbl>, …

Summary table of NA values (of selected values for better clarity):

missing_summary <- df10_selection %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  arrange(desc(n_missing))
missing_summary
# A tibble: 13 × 2
   variable             n_missing
   <chr>                    <int>
 1 preventive_tx              661
 2 incidence_hiv_100k         133
 3 case_fatality_ratio        113
 4 mortality_hiv_100k         113
 5 mortality_nohiv_100k       113
 6 rr_treated                 113
 7 rr_incidence               113
 8 rr_labconf_pulm            113
 9 country                      0
10 year                         0
11 incidence_100k               0
12 mortality_100k               0
13 contacts_best                0

Heatmap which highlights where values are missing:

missing_long <- df10_selection%>%
  mutate(row = row_number()) %>%
  mutate(across(everything(), as.character)) %>%   #convert values to character
  pivot_longer(-row, names_to = "variable", values_to = "value") %>%
  mutate(is_missing = is.na(value) | value == "NA")


ggplot(missing_long, aes(x = row, y = variable, fill = is_missing)) +
  geom_tile() +
  scale_fill_manual(values = c("FALSE" = "white", "TRUE" = "black")) +
  labs(title = "Missing values - Heatmap",
       x = "Row",
       y = "Variable",
       fill = "Missing") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),    # hide x labels 
        axis.ticks.x = element_blank())

Missing values per country:

#Countries with most missing values: 
missing_by_country <- df10_selection |>
  group_by(country) |>
  summarise(
    total_rows = n(),
    total_missing = sum(is.na(across(everything()))),
    prop_missing = total_missing / (total_rows * ncol(df10))
  ) |>
  arrange(desc(prop_missing)) |> 
  slice_head(n=10)

#Top 10 countries with most missing values:
df_top10 <- df10_selection |>
  filter(country %in% missing_by_country$country) |>
  mutate(row = row_number())

#Prepare for missing values heatmap: 
missing_long <- df_top10 |>
  mutate(across(-c(country, row), ~ is.na(.))) |>  # logical missing values
  pivot_longer(-c(country, row), names_to = "variable", values_to = "is_missing")

#Plot heatmap
ggplot(missing_long, aes(x = variable, y = row, fill = is_missing)) +
  geom_tile() +
  facet_wrap(~ country, scales = "free_y") +  # separate heatmap per country
  scale_fill_manual(values = c("FALSE" = "white", "TRUE" = "black")) +
  labs(
    title = "Missing values - Heatmap for countries with the top 10 most missing values",
    x = "Variable",
    y = "Row",
    fill = "Missing"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text = element_text(face = "bold")
  )

Global average of trends over time:

df10_selection
# A tibble: 1,847 × 13
   country      year case_fatality_ratio incidence_100k incidence_hiv_100k
   <chr>       <dbl>               <dbl>          <dbl>              <dbl>
 1 Afghanistan  2015                0.22            200               0.05
 2 Afghanistan  2016                0.19            204               0.06
 3 Afghanistan  2017                0.18            209               0.06
 4 Afghanistan  2018                0.18            212               0.06
 5 Afghanistan  2019                0.17            213               0.06
 6 Afghanistan  2020                0.2             205               0.03
 7 Afghanistan  2021                0.2             206               0.01
 8 Afghanistan  2022                0.19            206               0.06
 9 Afghanistan  2023                0.19            204               0.05
10 Afghanistan  2024                0.19            203               0.08
# ℹ 1,837 more rows
# ℹ 8 more variables: mortality_100k <dbl>, mortality_hiv_100k <dbl>,
#   mortality_nohiv_100k <dbl>, rr_treated <dbl>, rr_incidence <dbl>,
#   rr_labconf_pulm <dbl>, contacts_best <dbl>, preventive_tx <dbl>
global_yearly_trends <- df10_selection |> 
  group_by(year) |> 
  summarise(
    incidence_mean = mean(incidence_100k, na.rm = TRUE),
    rr_incidence_mean = mean(rr_incidence, na.rm = TRUE),
    household_inf_mean = mean(contacts_best, na.rm = TRUE)
  )

global_yearly_trends
# A tibble: 10 × 4
    year incidence_mean rr_incidence_mean household_inf_mean
   <dbl>          <dbl>             <dbl>              <dbl>
 1  2015           133.             3422.             47988.
 2  2016           125.             3143.             49461.
 3  2017           118.             2883.             51400.
 4  2018           113.             2716.             51221.
 5  2019           111.             2600.             55363.
 6  2020           105.             2471.             46000.
 7  2021           103.             2381.             56204.
 8  2022           105.             2326.             65069.
 9  2023           104.             2360.             71849.
10  2024           108.             2344.             78278.

Prepare for plotting - to long shape and standardization, because variables are on different scales (per different number of people)

global_yearly_trends_long <- global_yearly_trends |> 
  mutate(across(
    -year,
    ~ (.x - mean(.x, na.rm = TRUE)) / sd(.x, na.rm = TRUE)
  )) |> 
  
  pivot_longer(
    cols = -year,
    names_to = "variable",
    values_to = "value"
  )

global_yearly_trends_long
# A tibble: 30 × 3
    year variable             value
   <dbl> <chr>                <dbl>
 1  2015 incidence_mean      2.04  
 2  2015 rr_incidence_mean   2.00  
 3  2015 household_inf_mean -0.855 
 4  2016 incidence_mean      1.24  
 5  2016 rr_incidence_mean   1.27  
 6  2016 household_inf_mean -0.719 
 7  2017 incidence_mean      0.583 
 8  2017 rr_incidence_mean   0.578 
 9  2017 household_inf_mean -0.541 
10  2018 incidence_mean      0.0213
# ℹ 20 more rows

Exploratory plots - Global average of TB incidence over time:

ggplot(global_yearly_trends_long, aes(x = year, y = value, color = variable)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  scale_x_continuous(breaks = unique(global_yearly_trends_long$year)) +
  labs(
    title = "Global Yearly Trends",
    x = "Year",
    y = "Value",
    color = "Indicator"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Interestingly, household contacts seemed to have dropped rapidly during Covid-19 quarantine (happened around april of 2020), only to increase rapidly in the time afterwards.